## Log the start time; a second timestamp is printed at the end of the
## script, so the two can be compared to read off the run duration.
print(Sys.time())

## Keep strings as character vectors rather than factors. This is already
## the default since R 4.0, but is set explicitly for older versions.
## `FALSE` is spelled out because `F` is an ordinary variable that can be
## reassigned, whereas `FALSE` is a reserved word.
options(stringsAsFactors=FALSE)

suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))

## Restore the objects saved in the replication .RData file.
## NOTE(review): presumably this is where `russia_ids` (joined on `tweetid`
## further down) comes from -- it is not defined anywhere else in this script.
load("data/russia_ids_replicate_psrm.RData")


## Column spec shared by every CSV part: keep only the four columns below
## and skip everything else. IDs are parsed as character strings, not numbers.
ira_col_types <- cols(
    .default = col_skip(),
    account_category = col_character(),
    tweet_id = col_character(),
    followers = col_integer(),
    external_author_id = col_character()
)

## Read the 13 parts data/IRAhandle_tweets_1.csv ... _13.csv into a list
## (one element per file) and stack them into a single table.
ira <- lapply(
    seq_len(13),
    function(part) {
        read_csv(
            paste0("data/IRAhandle_tweets_",part,".csv"),
            col_types = ira_col_types
        )
    }
)
ira <- data.table::rbindlist(ira)


## Restrict to the four account categories of interest. `%in%` never
## yields NA, so rows with a missing category are simply dropped.
kept_categories <- c("RightTroll","LeftTroll","HashtagGamer","NewsFeed")
ira <- ira[ira$account_category %in% kept_categories, ]

## Copy `tweet_id` into `tweetid` (the key name used for the join with
## `russia_ids`), then keep only the tweets present in both tables.
ira <- ira %>%
    mutate(tweetid = tweet_id) %>%
    inner_join(russia_ids, by="tweetid")

## Seed for the final samples. The commented-out seed was used for the
## pilot sampling.
## set.seed(98765321) # pilot sampling
set.seed(123456789)

## Draw a random sample of tweets from one account category within a date
## window.
##
## tweets     table with (at least) `account_category` and `tweet_time` columns
## category   value of `account_category` to keep
## first_day  first day of the window, inclusive, as "YYYY-MM-DD"
## last_day   last day of the window, inclusive, as "YYYY-MM-DD"
## size       number of rows to sample (without replacement; sample_n()
##            fails if fewer rows are available)
##
## Returns the sampled rows of `tweets`.
##
## The window test compares the first 10 characters of `tweet_time` as
## strings, which orders correctly only for zero-padded ISO dates.
## NOTE(review): assumes `tweet_time` starts with "YYYY-MM-DD" -- confirm.
## An earlier commented-out alternative,
##   as.Date(as.POSIXct(tweet_time, "%Y-%d-%m %H:%M"))
## was annotated "same", presumably meaning it selected the same rows.
sample_category_tweets <- function(tweets, category,
                                   first_day = "2016-06-01",
                                   last_day = "2016-11-08",
                                   size = 450) {
    stopifnot(all(c("account_category", "tweet_time") %in% names(tweets)))

    tweet_day <- substr(tweets[["tweet_time"]], 1, 10)
    in_scope <- tweets[["account_category"]] == category &
        tweet_day >= first_day &
        tweet_day <= last_day

    ## which() discards NA comparisons, matching subset()'s treatment of
    ## NA as FALSE. The index is stored in a plain variable so that row
    ## selection behaves the same for data.frame, tibble and data.table.
    row_idx <- which(in_scope)
    sample_n(tweets[row_idx, ], size=size)
}

## Order matters for reproducibility: both draws consume the same random
## number stream, so the right-troll sample must be drawn before the
## left-troll sample.
ira_right_s <- sample_category_tweets(ira, "RightTroll")

ira_left_s <- sample_category_tweets(ira, "LeftTroll")

## Export the text and ID of every sampled tweet, one UTF-8 CSV per troll
## category, without a row-name column.
## `FALSE` is spelled out rather than `F`: `F` is a reassignable variable,
## and load() earlier in this script can bring arbitrary objects into the
## global environment.
## The column selector is kept as a literal c(...) inside `[` so that it
## selects columns whether the sample is a data.frame or a data.table.
write.csv(
    ira_right_s[,c("tweet_text", "tweet_id")],
    file="data/russian_right_trolls_sample450_final_with_ids_replicate_psrm.csv",
    row.names=FALSE,
    fileEncoding="UTF-8"
)

write.csv(
    ira_left_s[,c("tweet_text", "tweet_id")],
    file="data/russian_left_trolls_sample450_final_with_ids_replicate_psrm.csv",
    row.names=FALSE,
    fileEncoding="UTF-8"
)

## Log the end time (compare with the timestamp printed at the start).
print(Sys.time())
